Socio-demographic Patterning of Microplastic Burden in the U.S.

Load in the data: years 2001-2016

  • demographics file
  • phthalates, urine file

downloaded from CDC National Health and Examination Survey, NHANES Questionnaires, Datasets, and Related Documentation

# 2001-2002
demographics01 <- read.xport(here("data","DEMO_B_2001_2002.XPT"))
phthalates01 <- read.xport(here("data","PHPYPA_B_2001_2002.XPT"))
nhanes01 <- merge(phthalates01,demographics01,all.x=T)

# 2003-2004
demographics03 <- read.xport(here("data","DEMO_C_2003_2004.XPT"))
phthalates03 <- read.xport(here("data","L24PH_C_2003_2004.XPT"))
nhanes03 <- merge(phthalates03,demographics03,all.x=T)

# 2005-2006
demographics05 <- read.xport(here("data","DEMO_D_2005_2006.XPT"))
phthalates05 <- read.xport(here("data","PHTHTE_D_2005_2006.XPT"))
nhanes05 <- merge(phthalates05,demographics05,all.x=T)

# 2007-2008
demographics07 <- read.xport(here("data","DEMO_E_2007_2008.XPT"))
phthalates07 <- read.xport(here("data","PHTHTE_E_2007_2008.XPT"))
nhanes07 <- merge(phthalates07,demographics07,all.x=T)

# 2009-2010
demographics09 <- read.xport(here("data","DEMO_F_2009_2010.XPT"))
phthalates09 <- read.xport(here("data","PHTHTE_F_2009_2010.XPT"))
nhanes09 <- merge(phthalates09,demographics09,all.x=T)

# 2011-2012
demographics11 <- read.xport(here("data","DEMO_G_2011_2012.XPT"))
phthalates11 <- read.xport(here("data","PHTHTE_G_2011_2012.XPT"))
nhanes11 <- merge(phthalates11,demographics11,all.x=T)

# 2013-2014
demographics13 <- read.xport(here("data","DEMO_H_2013_2014.XPT"))
phthalates13 <- read.xport(here("data","PHTHTE_H_2013_2014.XPT"))
nhanes13 <- merge(phthalates13,demographics13,all.x=T)

# 2015-2016
demographics15 <- read.xport(here("data","DEMO_I_2015_2016.XPT"))
phthalates15 <- read.xport(here("data","PHTHTE_I_2015_2016.XPT"))
nhanes15 <- merge(phthalates15,demographics15,all.x=T)

## add a column with the year to each data file:

nhanes01$year <- 2001
nhanes03$year <- 2003
nhanes05$year <- 2005
nhanes07$year <- 2007
nhanes09$year <- 2009
nhanes11$year <- 2011
nhanes13$year <- 2013
nhanes15$year <- 2015

## choose the variables to select from each data file:
var<-c('year','DMDEDUC3','DMDEDUC2','DMDHREDU','DMDHSEDU','RIDAGEYR','RIAGENDR','RIDRETH1','INDFMPIR','DMDYRSUS','DMDCITZN','URXMEP')
nhanes01s<-subset(nhanes01,select=var)
nhanes03s<-subset(nhanes03,select=var)
nhanes05s<-subset(nhanes05,select=var)
nhanes07s<-subset(nhanes07,select=var)
nhanes09s<-subset(nhanes09,select=var)
nhanes11s<-subset(nhanes11,select=var)
nhanes13s<-subset(nhanes13,select=var)
nhanes15s<-subset(nhanes15,select=var)

## create one dataset for NHANES demographics and phthalates, year 2001-2016
nhanes <- rbind(nhanes01s,nhanes03s,nhanes05s,nhanes07s,nhanes09s,nhanes11s,nhanes13s,nhanes15s)

## create a data file so that I don't need to load, clean, and select the data again
# has all demographics and URXMEP phthalate for NHANES year 2001-2016
write.csv(nhanes, "nhanes_december22.csv")

Visually inspect the data:

After inspecting the phthalate data in its raw form and the log form, we determined it is necessary to log transform the data in order to visualize it.

hist(nhanes$URXMEP)

hist(log(nhanes$URXMEP))

nhanes$log_monoethyl = log(nhanes$URXMEP)

Re-categorize the data:

Original: - year = year the observation took place *this was added by me when I merged the years of data - DMDEDUC3 = the highest grade level of education completed by participants 6-19 y.o. - DMDEDUC2 = the highest grade/level of education completed by participants 20 years and older - DMDHREDU = the household reference person’s education level - DMDHSEDU = the household reference person’s spouse’s education level - RIDAGEYR = ages in years at screening - RIAGENDR = gender - RIDRETH1 = race/Hispanic origin - INDFMPIR = ratio of family income to poverty - DMDYRSUS = length of time in U.S. - DMDCITZN = citizenship status - URXMEP = mono-ethyl phthalate (ng/mL)

Household reference person’s education level

hist(nhanes$DMDHREDU)

# add "edu_ref" to "nhanes" 
nhanes$edu_ref = factor(ifelse(nhanes$DMDHREDU %in% 1:4,"partial college and below",
                         ifelse(nhanes$DMDHREDU==5,"college and beyond",NA)),
                  levels=c("partial college and below","college and beyond"))
summary(nhanes$edu_ref)

Household reference person’s spouse’s education level

hist(nhanes$DMDHSEDU)

# add "edu_spouse_ref" to "nhanes" 
nhanes$edu_spouse_ref = factor(ifelse(nhanes$DMDHSEDU %in% 1:4,"partial college and below",
                         ifelse(nhanes$DMDHSEDU==5,"college and beyond",NA)),
                  levels=c("partial college and below","college and beyond"))
summary(nhanes$edu_spouse_ref)

Highest grade level of education completed by participants 6-19 y.o.

hist(nhanes$DMDEDUC3)

# add "edu_child_participants" to "nhanes" 
nhanes$edu_child_participants = factor(ifelse(nhanes$DMDEDUC3 %in% 0:8,"primary",
                         ifelse(nhanes$DMDEDUC3 %in% 9:15,"secondary",NA)),
                  levels=c("primary","secondary"))
summary(nhanes$edu_child_participants)

## not a good variable d/t 14,755 "refused," "don't know," "missing,

Highest grade/level of education completed by participants 20 years and older

hist(nhanes$DMDEDUC2)

# add "edu_adult_participants" to "nhanes" 
nhanes$edu_adult_participants = factor(ifelse(nhanes$DMDEDUC2 ==1, "less than 9th grade",
                                       ifelse(nhanes$DMDEDUC2 ==2, "9-11th grade",
                                       ifelse(nhanes$DMDEDUC2 ==3, "high school grad/GED", 
                                       ifelse(nhanes$DMDEDUC2 ==4, "some college or AA",
                                       ifelse(nhanes$DMDEDUC2 ==5, "college grad or above", NA))))))
summary(nhanes$edu_adult_participants)

Age in years at screening

Based on the summary statistics of the variable “RIDAGEYR”: the youngest participant is 3 years old, the oldest participant is 85 years old, and the median age is 31 years old.

hist(nhanes$RIDAGEYR)

summary(nhanes$RIDAGEYR)


# add "age" to "nhanes" 
nhanes$age = factor(ifelse(nhanes$RIDAGEYR >=65,"older adult",
                    ifelse(nhanes$RIDAGEYR >25 & nhanes$RIDAGEYR<64,"middle-aged",
                    ifelse(nhanes$RIDAGEYR >=19 & nhanes$RIDAGEYR<=25,"young adult",
                    ifelse(nhanes$RIDAGEYR <=18,"child",NA)))),
             levels=c("child","young adult", "middle-aged", "older adult"))
summary(nhanes$age)

Gender

hist(nhanes$RIAGENDR)

# add "gender" to "nhanes"
nhanes$gender = factor(ifelse(nhanes$RIAGENDR ==1, "male",
                       ifelse(nhanes$RIAGENDR ==2, "female", NA)),
                levels=c("male", "female"))
summary(nhanes$gender)

Race/ethnicity

hist(nhanes$RIDRETH1)

# add "race/ethnicity" to "nhanes"
nhanes$ethnicity = factor(ifelse(nhanes$RIDRETH1 ==1, "Mexican American",
                       ifelse(nhanes$RIDRETH1 ==2, "Other Hispanic",
                       ifelse(nhanes$RIDRETH1 ==3, "Non-Hispanic White", 
                       ifelse(nhanes$RIDRETH1 ==4, "Non-Hispanic Black",
                       ifelse(nhanes$RIDRETH1 ==5, "Other or Multi", NA))))),
                levels=c("Non-Hispanic White", "Non-Hispanic Black", "Mexican American", "Other Hispanic", "Other or Multi"))
summary(nhanes$ethnicity)

Citizenship Status

hist(nhanes$DMDCITZN)

# add "citizenship status" to "nhanes"
nhanes$citizenship = factor(ifelse(nhanes$DMDCITZN ==1, "birth or naturalization",
                       ifelse(nhanes$DMDCITZN ==2, "not U,S, citizen", NA)),
                levels=c("birth or naturalization", "not U,S, citizen"))
summary(nhanes$citizenship)

Length of time in the U.S.

hist(nhanes$DMDYRSUS)

# add length of time in the U.S. to "nhanes"
nhanes$years_in_US = factor(ifelse(nhanes$DMDYRSUS ==1, "less than 1 year",
                            ifelse(nhanes$DMDYRSUS ==2, "1 year or more, but less than 5 years",
                            ifelse(nhanes$DMDYRSUS ==3, "5 years or more, but less than 10 years",
                            ifelse(nhanes$DMDYRSUS ==4, "10 years or more, but less than 15 years",
                            ifelse(nhanes$DMDYRSUS ==5, "15 years or more, but less than 20 years",
                            ifelse(nhanes$DMDYRSUS ==6, "20 years or more, but less than 30 years",
                            ifelse(nhanes$DMDYRSUS ==7, "30 years or more, but less than 40 years",
                            ifelse(nhanes$DMDYRSUS ==8, "40 years or more, but less than 50 years",
                            ifelse(nhanes$DMDYRSUS ==9, "50 years or more", NA))))))))))
summary(nhanes$years_in_US)

## not a very good variable, d/t 18,035 "refused," "don't know", or "missing" 

Ratio of Family Income to Poverty Level

hist(nhanes$INDFMPIR)

# add ratio of family income to poverty level
nhanes$income_to_poverty_ratio = factor(ifelse(nhanes$INDFMPIR < 1,"at poverty threshold",
                                        ifelse(nhanes$INDFMPIR >=1 & nhanes$INDFMPIR <2, "family income 2x poverty threshold",
                                        ifelse(nhanes$INDFMPIR >=2 & nhanes$INDFMPIR <3, "family income 3x poverty threshold",
                                        ifelse(nhanes$INDFMPIR >=3 & nhanes$INDFMPIR <4, "family income 4x poverty threshold",
                                        ifelse(nhanes$INDFMPIR >=4 & nhanes$INDFMPIR <5, "family income 5x poverty threshold",
                                        ifelse(nhanes$INDFMPIR ==5, "family income more than 5x poverty threshold", NA)))))),
                                 levels=c("at poverty threshold", "family income 2x poverty threshold", "family income 3x poverty threshold", "family income 4x poverty threshold", "family income 5x poverty threshold", "family income more than 5x poverty threshold"))

summary(nhanes$income_to_poverty_ratio)

Boxplots

# Household reference person education level and Mono-ethyl phthalate
box_edu_ref <- ggplot(data = nhanes, aes(x=log_monoethyl, y=edu_ref, fill=edu_ref)) + 
  scale_fill_brewer(palette="PuBuGn") +
  geom_boxplot() +
  theme(text = element_text(size=12)) +
  xlab("(logged) Mono-Ethyl Phthalate Level (ng/mL)") +
  ylab("Household Reference Person's Education Level") +
ggtitle("Household Ref. Person's Education Level and Logged Phthalate Level")

box_edu_ref

# Family Income and Mono-ethyl phthalate
box_poverty <- ggplot(data = nhanes, aes(x=log_monoethyl, y=income_to_poverty_ratio, fill=income_to_poverty_ratio)) + 
  scale_fill_brewer(palette="PuBuGn") +
  geom_boxplot() +
  theme(text = element_text(size=12)) +
  xlab("(logged) Mono-Ethyl Phthalate Level (ng/mL)") +
  ylab("Ratio of Participant's Family Income to Poverty") +
  ggtitle("Poverty and Logged Phthalate Level")

box_poverty

# Age and Mono-ethyl phthalate
box_age <- ggplot(data = nhanes, aes(x=log_monoethyl, y=age, fill=age)) + 
  scale_fill_brewer(palette="PuBuGn") +
  geom_boxplot() +
  theme(text = element_text(size=12)) +
  xlab("(logged) Mono-Ethyl Phthalate Level (ng/mL)") +
  ylab("Age of Participant") +
  ggtitle("Age and Logged Phthalate Level")

box_age

Suite of Models

# multivariate regression analysis of socio-demographic variables and phthalates
modela <- lm(log_monoethyl ~ edu_ref + age + gender + ethnicity + income_to_poverty_ratio +
                  citizenship, data = nhanes)
summary(modela)

# take out gender
modelb <- lm(log_monoethyl ~ edu_ref + age + ethnicity + income_to_poverty_ratio +
                  citizenship, data = nhanes)
summary(modelb)

# take out gender, citizenship
modelc <- lm(log_monoethyl ~ edu_ref + age + ethnicity + income_to_poverty_ratio, data = nhanes)
summary(modelc)

BIC

The Bayesian Information Criterion (BIC),

BIC_list <- c(BIC(modela), BIC(modelb), BIC(modelc))

model_output <- rbind(data.frame(glance(modela)), data.frame(glance(modelb)), data.frame(glance(modelc))) %>% select(BIC)

model_output <- mutate(model_output, delta.BIC = BIC-min(BIC_list))
model_output$model <- c("Model A", "Model B", "Model C")
model_output <- model_output[,c("model", "BIC", "delta.BIC")]

kable(model_output, format = "markdown", digits = 3, caption = "BIC, and Delta.BIC for the models. Delta BIC > 7 indicates models that should be dismissed from further consideration.")
BIC, and Delta.BIC for the models. Delta BIC > 7 indicates models that should be dismissed from further consideration.
model BIC delta.BIC
Model A 70155.08 6.590
Model B 70148.49 0.000
Model C 70220.62 72.136

Forest Models

Forest model for Model A:

data(nhanes) # Telling R we want to use this data

modela <- lm(log_monoethyl ~ edu_ref + age + gender + ethnicity + income_to_poverty_ratio +
                  citizenship, data = nhanes)
summ(modela)

summ(modela, robust = "HC1") #robust standard errors 

summ(modela, confint = TRUE, digits = 3) #In many cases, you’ll learn more by looking at confidence intervals than p-values. You can request them from summ. default is 95% CIs

summ(modela, confint = TRUE, pvals = FALSE) #DROP the p values all together

# THE GRAPH
plot_summs(modela)

plot_summs(modela, robust = TRUE)

plot_summs(modela, inner_ci_level = .9)

# plot coefficient uncertainty as normal distributions
plot_summs(modela, plot.distributions = TRUE, inner_ci_level = .9)

# table output for Word and RMarkdown documents
## error is in the parenthesis
export_summs(modela, scale = TRUE)

# confidence intervals instead of standard errors
export_summs(modela, scale = TRUE,
             error_format = "[{conf.low}, {conf.high}]")

renaming variables in the forest plot

forest1 <- plot_summs(
        point.size = 3,
        fontsize=8,
        colors = "darkseagreen4",
        modela, coefs = c("Household Education College and Beyond
                                 Partial College and Below (ref)" = "edu_refcollege and beyond", 
                          
                                 "Age: Young Adult
                                 Child (ref)" = "ageyoung adult",
                          
                                 "Age: Middle-aged
                                 Age:Child (ref)" = "agemiddle-aged",
                          
                                 "Age: Older Adult
                                 Age: Child (ref)" = "ageolder adult",
                          
                                 "Gender: Female
                                 Gender: Male (ref)" = "genderfemale",
                          
                                 "Ethnicity: Non-Hispanic Black
                                 Non-Hispanic White (ref)" = "ethnicityNon-Hispanic Black",
                          
                                 "Ethnicity: Mexican American
                                 Non-Hispanic White (ref)" = "ethnicityMexican American",
                          
                                 "Ethnicity: Other Hispanic
                                 Non-Hispanic White (ref)" = "ethnicityOther Hispanic",
                          
                                 "Ethnicity: Other or Multi
                                 Non-Hispanic White (ref)" = "ethnicityOther or Multi",
                          
                                 "Family Income to Poverty Ratio: 2x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 2x poverty threshold",
                          
                                 "Family Income to Poverty Ratio: 3x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 3x poverty threshold",
                          
                                 "Family Income to Poverty Ratio: 4x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 4x poverty threshold",
                          
                                 "Family Income to Poverty Ratio: 5x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 5x poverty threshold",
                          
                                 "Family Income to Poverty Ratio: more than 5x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income more than 5x poverty threshold",
                          
                                 "Citizenship Status: Not U.S. Citizen
                                 U.S. Citizen by birth or naturalization (ref)" = "citizenshipnot U,S, citizen"),
        
                          scale = TRUE, robust = TRUE)

forest1

Forest model for Model B: ***

data(nhanes) # Telling R we want to use this data

modelb <- lm(log_monoethyl ~ edu_ref + age + ethnicity + income_to_poverty_ratio +
                  citizenship, data = nhanes)
summ(modelb)

summ(modelb, robust = "HC1") #robust standard errors 

summ(modelb, confint = TRUE, digits = 3) #In many cases, you’ll learn more by looking at confidence intervals than p-values. You can request them from summ. default is 95% CIs

summ(modelb, confint = TRUE, pvals = FALSE) #DROP the p values all together

# THE GRAPH
plot_summs(modelb)

plot_summs(modelb, robust = TRUE)

plot_summs(modelb, inner_ci_level = .9)

# plot coefficient uncertainty as normal distributions
plot_summs(modelb, plot.distributions = TRUE, inner_ci_level = .9)

# table output for Word and RMarkdown documents
## error is in the parenthesis
export_summs(modelb, scale = TRUE)

# confidence intervals instead of standard errors
export_summs(modelb, scale = TRUE,
             error_format = "[{conf.low}, {conf.high}]")

renaming variables in the forest plot 2

forest2 <- plot_summs(
        point.size = 3,
        fontsize=8,
        colors = "darkslategray",
        modelb, coefs = c("Household Education College and Beyond
                                 Partial College and Below (ref)" = "edu_refcollege and beyond", 
                          
                                 "Age: Young Adult
                                 Child (ref)" = "ageyoung adult",
                          
                                 "Age: Middle-aged
                                 Age:Child (ref)" = "agemiddle-aged",
                          
                                 "Age: Older Adult
                                 Age: Child (ref)" = "ageolder adult",
                          
                                 "Ethnicity: Non-Hispanic Black
                                 Non-Hispanic White (ref)" = "ethnicityNon-Hispanic Black",
                          
                                 "Ethnicity: Mexican American
                                 Non-Hispanic White (ref)" = "ethnicityMexican American",
                          
                                 "Ethnicity: Other Hispanic
                                 Non-Hispanic White (ref)" = "ethnicityOther Hispanic",
                          
                                 "Ethnicity: Other or Multi
                                 Non-Hispanic White (ref)" = "ethnicityOther or Multi",
                          
                                 "Family Income to Poverty Ratio: 2x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 2x poverty threshold",
                          
                                 "Family Income to Poverty Ratio: 3x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 3x poverty threshold",
                          
                                 "Family Income to Poverty Ratio: 4x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 4x poverty threshold",
                          
                                 "Family Income to Poverty Ratio: 5x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 5x poverty threshold",
                          
                                 "Family Income to Poverty Ratio: more than 5x Poverty threshold
                                 At poverty threshold (ref)" = "income_to_poverty_ratiofamily income more than 5x poverty threshold",
                          
                                 "Citizenship Status: Not U.S. Citizen
                                 U.S. Citizen by birth or naturalization (ref)" = "citizenshipnot U,S, citizen"),
        
                          scale = TRUE, robust = TRUE)

forest2

Visualize Missing Data

vis_miss(nhanes, sort_miss = TRUE)

References:

  • add a reference to demonstrate rationale for categorizing each variable the way that I do (ex - college completion and above and partial college and below)
  • use resources from GEOG 227